library(xlsx)
library(data.table)
library(stringr)

## This script generates the "Food_Chem_Categorized.csv" file from
## supplementary file 1 ("SuppFile_1_Chemicals_160322.xlsx").

## Locate CASRN-like substrings (digits-2 digits-1 digit) in each element of
## `x`. Returns the result of stringr::str_match_all(): a list with one
## match matrix per input string.
find_cas <- function(x) {
  cas_pattern <- "[0-9]{1,8}-[0-9]{2}-[0-9]"
  str_match_all(string = x, pattern = cas_pattern)
}

## Directory that holds supplementary file 1.
## NEED TO EDIT TO REFLECT YOUR FILE DIRECTORY
path_to_suppfiles <- ""

## Full path to the supplementary workbook that the sheets are read from.
sfile_1 <- file.path(
  path_to_suppfiles,
  "SuppFile_1_Chemicals_160322.xlsx"
)

################################################################################

## Fail early with a readable message if the workbook cannot be found, rather
## than letting the spreadsheet reader raise a less obvious error.
if (!file.exists(sfile_1)) {
  stop("Supplementary file 1 not found at '", sfile_1,
       "'. Edit `path_to_suppfiles` to point at its directory.",
       call. = FALSE)
}

## Collapse a list of match matrices (as returned by find_cas) into a vector
## of unique CASRN strings. Empty cells are read as NA, and str_match_all
## reports NA input as an NA match; those are dropped here so that NA never
## ends up listed as a chemical.
unique_cas <- function(match_list) {
  cas <- as.character(unlist(match_list, use.names = FALSE))
  unique(cas[!is.na(cas)])
}

## NOTE: the column names used below are the sheet headers as they appear
## after read.xlsx() has imported them; they must match the workbook.
## find_cas is vectorized over its input, so it is applied to each whole
## column at once; as.character() guards against columns read as factors.
## Wrapping the result in list() makes data.table store it as a list column
## (one match matrix per row).

## load sheet containing inventory of effective food contact substances
efcs <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 2))
efcs[, cas_mat := list(find_cas(as.character(Food.Contact.Substance)))]
efcs_cas <- unique_cas(efcs$cas_mat)

## load sheet containing Indirect Additives Used in Food Contact Substances
ifcs <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 3, startRow = 3))
ifcs[, cas_mat := list(find_cas(as.character(CAS.Reg.No.or.other.ID)))]
ifcs_cas <- unique_cas(ifcs$cas_mat)

## load sheet containing EAFUS Inventory
eafus <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 4, startRow = 15))
eafus[, cas_mat := list(find_cas(as.character(CASRN.or.Other.Code..)))]
eafus_cas <- unique_cas(eafus$cas_mat)

## load sheet containing Threshold of Regulation Exemptions
tor <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 5))
tor[, cas_mat := list(find_cas(as.character(Food.Contact.Substance)))]
tor_cas <- unique_cas(tor$cas_mat)

## load sheet containing FDA SCOGS Inventory
scogs <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 6, startRow = 3))
scogs[, cas_mat := list(find_cas(as.character(CAS.or.Other.Code)))]
scogs_cas <- unique_cas(scogs$cas_mat)

## load sheet containing FDA GRAS Notice Inventory
grn <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 7))
grn[, cas_mat := list(find_cas(as.character(CAS.Reg..No.)))]
grn_cas <- unique_cas(grn$cas_mat)

## load sheet containing FEMA GRAS Inventory
fmg <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 8))
fmg[, cas_mat := list(find_cas(as.character(CAS.No.)))]
fmg_cas <- unique_cas(fmg$cas_mat)

## load sheet containing Alan Wood's Pesticides Inventory
awp <- as.data.table(read.xlsx(file = sfile_1, sheetIndex = 9))
awp[, cas_mat := list(find_cas(as.character(CASRN)))]
awp_cas <- unique_cas(awp$cas_mat)

################################################################################

## compile all cas including source annotation
## One entry per source inventory, in the order the rows should appear in the
## output; the names are the source labels written to the `src` column.
cas_by_source <- list(
  "Effective FCS" = efcs_cas,
  "Indirect FCS"  = ifcs_cas,
  "EAFUS"         = eafus_cas,
  "TOR"           = tor_cas,
  "SCOGS"         = scogs_cas,
  "GRAS Notice"   = grn_cas,
  "FEMA GRAS"     = fmg_cas,
  "Pesticide"     = awp_cas
)

## Build one (casn, src) table per source and stack them. rep() is used for
## the label so that a source with no CAS numbers contributes zero rows
## instead of breaking the bind. The raw sheet tables (efcs, ifcs, ...) are
## left untouched rather than being overwritten with character matrices.
ALLchems <- rbindlist(lapply(names(cas_by_source), function(src_label) {
  casn <- as.character(cas_by_source[[src_label]])
  data.table(casn = casn, src = rep(src_label, length(casn)))
}))

## Category assigned to each source label (same numeric codes as before:
## 1 for EAFUS / SCOGS / GRAS Notice / FEMA GRAS, 2 for the food contact
## substance lists and TOR, 3 for pesticides).
src_category <- c(
  "EAFUS"         = 1,
  "SCOGS"         = 1,
  "GRAS Notice"   = 1,
  "FEMA GRAS"     = 1,
  "Indirect FCS"  = 2,
  "Effective FCS" = 2,
  "TOR"           = 2,
  "Pesticide"     = 3
)

## Look up the category of every row from its source label in one step.
ALLchems[, cat := unname(src_category[src])]

################################################################################
## Save the categorized chemical table as a CSV in the "inst" directory
## (relative to the current working directory), without row names.
write.csv(
  x = ALLchems,
  row.names = FALSE,
  file = file.path("inst", "Food_Chem_Categorized.csv")
)
################################################################################

